This notebook examines the relationship between different features of the data for distinguishing viral from nonviral sequences.

Please reach out to James Riddell () or Bridget Hegarty () regarding any issues, or open an issue on github.

library(ggplot2)
library(plyr)
library(reshape2)
library(viridis)
Loading required package: viridisLite
library(tidyr)

Attaching package: ‘tidyr’

The following object is masked from ‘package:reshape2’:

    smiths
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:plyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise, summarize

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(readr)
library(data.table)
data.table 1.14.0 using 1 threads (see ?getDTthreads).  Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following objects are masked from ‘package:reshape2’:

    dcast, melt
viruses <- read_tsv("../IntermediaryFiles/viral_tools_combined.tsv")

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  seqtype = col_character(),
  contig = col_character(),
  checkv_provirus = col_character(),
  checkv_quality = col_character(),
  method.x = col_character(),
  Classified = col_character(),
  IDs_all = col_character(),
  Seq = col_character(),
  Kaiju_Viral = col_character(),
  Kingdom = col_character(),
  type = col_character(),
  vibrant_quality = col_character(),
  method.y = col_character(),
  vibrant_prophage = col_character(),
  vs2type = col_character(),
  max_score_group = col_character()
)
ℹ Use `spec()` for the full column specifications.
colnames(viruses)
 [1] "Index"                "seqtype"              "contig"               "checkv_provirus"      "checkv_completeness"  "checkv_contamination"
 [7] "checkv_viral_genes"   "checkv_host_genes"    "checkv_total_genes"   "checkv_length"        "checkv_quality"       "method.x"            
[13] "Classified"           "NCBI_taxon"           "len"                  "ID_best"              "IDs_all"              "Seq"                 
[19] "Kaiju_Viral"          "Kingdom"              "score"                "pvalue"               "bh_pvalue"            "type"                
[25] "vibrant_quality"      "method.y"             "vibrant_prophage"     "category"             "vs2type"              "dsDNAphage"          
[31] "ssDNA"                "NCLDV"                "RNA"                  "lavidaviridae"        "max_score"            "max_score_group"     
[37] "hallmark"             "viral"                "cellular"             "percent_host"         "percent_viral"        "percent_unknown"     
There were 25 warnings (use warnings() to see them)
ggplot(viruses, aes(x=NCLDV, y=viral)) +
There were 50 or more warnings (use warnings() to see the first 50)
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~confusion_matrix_high_MCC, scales = "free") +
  xlab("NCLDV VS2 Score") +
  ylab("VS2 Viral Score")

Important features by sequence type

pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")

ggplot(viruses, aes(x=checkv_host_genes, y=checkv_viral_genes)) +
There were 34 warnings (use warnings() to see them)
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Number of Host Genes") +
  ylab("Number of Viral Genes")

ggplot(viruses, aes(x=hallmark, y=checkv_length)) +
There were 32 warnings (use warnings() to see them)
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Number of Hallmark Genes") +
  ylab("Length of Sequence") 

table(viruses$checkv_host_genes>=50, viruses$seqtype)
       
        archaea bacteria fungi plasmid protist virus
  FALSE    7542    47439  1146    3938    4993 10000
  TRUE     2399    17278   497    1052       7     0
ggplot(viruses, aes(x=checkv_length, y=checkv_completeness)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Length") +
  ylab("Completeness") 

ggplot(viruses, aes(x=hallmark, y=checkv_completeness)) +
There were 30 warnings (use warnings() to see them)
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Hallmark Genes") +
  ylab("Completeness") 

table(viruses$seqtype[viruses$checkv_length>50000 & viruses$hallmark==0])/table(viruses$seqtype)

  archaea  bacteria     fungi   plasmid   protist     virus 
0.3314556 0.2962900 0.5605600 0.4841683 0.0168000 0.0184000 
table(viruses$seqtype[((viruses$checkv_viral_genes*3) <= viruses$checkv_host_genes) & viruses$checkv_provirus=="No"])/table(viruses$seqtype)

  archaea  bacteria     fungi   plasmid   protist     virus 
0.9842068 0.8948808 0.9263542 0.8298597 0.7198000 0.0210000 
table(viruses$seqtype[viruses$checkv_viral_genes==0 & viruses$checkv_host_genes>=1])/table(viruses$seqtype)

  archaea  bacteria     fungi   plasmid   protist     virus 
0.8565537 0.6711065 0.2422398 0.4166333 0.0306000 0.0057000 
table(viruses$seqtype[viruses$percent_viral>=50])/table(viruses$seqtype)

    archaea    bacteria       fungi     plasmid     protist       virus 
0.000201187 0.027797951 0.006695070 0.004208417 0.053200000 0.306600000 
table(viruses$seqtype[viruses$percent_unknown>=75])/table(viruses$seqtype)

   archaea   bacteria      fungi    plasmid    protist      virus 
0.11628609 0.08322388 0.82349361 0.45511022 0.85360000 0.39280000 
table(viruses$seqtype[viruses$percent_unknown>=75 & viruses$checkv_length<50000])/table(viruses$seqtype)

   archaea   bacteria      fungi    plasmid    protist      virus 
0.10552258 0.08021076 0.34814364 0.25430862 0.83540000 0.25300000 
table(viruses$seqtype[viruses$hallmark>2])/table(viruses$seqtype[viruses$seqtype %in% unique(viruses$seqtype[viruses$hallmark>2])])

    archaea    bacteria     plasmid       virus 
0.008952822 0.045428558 0.055711423 0.576000000 
table(viruses$seqtype, viruses$Kaiju_Viral)
seqdata <- data.frame(seqtype=viruses$seqtype[!duplicated(viruses$contig)])

rownames(seqdata) <- viruses$contig[!duplicated(viruses$contig)]
library(phyloseq)
features_table <- viruses[viruses$Index==1,]
features_table <- features_table[,colnames(features_table) %in% c( 
                                              "checkv_viral_genes",
                                              "checkv_host_genes",
                                              "checkv_unknown_genes",
                                              "checkv_length",
                                              "checkv_completeness",
                                              "checkv_total_genes",
                                              "percent_host",
                                              "percent_viral",
                                              "hallmark",
                                              "percent_unknown"
                                              )]

features_table[is.na(features_table)] <- 0
ft_colnames <- colnames(features_table)
features_table <- t(features_table)
rownames(features_table) <- ft_colnames
colnames(features_table) <- rownames(seqdata)

physeq_pooled <- phyloseq(otu_table(features_table, taxa_are_rows = T))
ordination <- phyloseq::ordinate(physeq =physeq_pooled, method = "PCoA", distance = "bray")
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
                          shape="numtools", color="num_viruses") + 
  geom_point(size = 3) +
  theme_bw() +
  geom_label(label=seqdata$toolcombo)

phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
                          shape="numtools", color="num_viruses") + 
  geom_point(size = 3) +
  theme_bw()

Viral Addition Tuning Rules

viruses_sankey_tv <- data.frame(seqtype=viruses$seqtype,
There were 12 warnings (use warnings() to see them)
                             kj_cel=rep(0,nrow(viruses)),
                             hall=rep(0,nrow(viruses)),
                             pv=rep(0,nrow(viruses)),
                             cvl_pu=rep(0, nrow(viruses)))
viruses_sankey_tv$kj_cel[viruses$Kaiju_Viral=="Viruses"] <- 0.5
viruses_sankey_tv$hall[viruses$hallmark>2] <- 0.5
viruses_sankey_tv$pv[viruses$percent_viral>=50] <- 0.5
viruses_sankey_tv$cvl_pu[viruses$checkv_length>50000 & viruses$percent_unknown<=75] <- 0.5    

viruses_sankey_tv$all <- rowSums(viruses_sankey_tv[,2:5])
viruses_sankey_tv %>%
  count(seqtype, all) %>% spread(key = all, value=n)
viruses_sankey_tv <- viruses_sankey_tv %>%
  count(seqtype, kj_cel, hall, pv, cvl_pu, all) %>%
  mutate(viral_score=factor(all))
ggplot(viruses_sankey_tv,
       aes(axis1 = kj_cel, axis2 = hall, axis3 = pv, axis4 = cvl_pu, y=n)) +
  geom_alluvium(aes(fill=viral_score),
                width = 0, knot.pos = 0, reverse = FALSE) +
  geom_stratum(width = 1/5) +
  theme_bw() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)),
            reverse = FALSE) +
  theme(
        axis.text.x=element_text(size=14, angle = 90)
        ) +
  scale_x_continuous(breaks=c(1,2,3,4),
    labels=c("kaiju", "hallmark", "% viral", "% unknown")) +
  facet_wrap(~seqtype, scales="free_y") 

Viral Removal Tuning Rules

viruses_sankey_tnv <- data.frame(seqtype=viruses$seqtype,
                             kj_cel=rep(0,nrow(viruses)),
                             hg_pro=rep(0,nrow(viruses)),
                             vg_hg=rep(0,nrow(viruses)),
                             vg_hg_pro=rep(0, nrow(viruses)),
                             cvl_hm=rep(0, nrow(viruses)),
                             cvl_cp=rep(0,nrow(viruses)))
viruses_sankey_tnv$kj_cel[viruses$Kaiju_Viral=="cellular organisms"] <- -0.5
viruses_sankey_tnv$hg_pro[viruses$checkv_host_genes>50 & viruses$provirus==FALSE] <- -1
viruses_sankey_tnv$vg_hg[viruses$checkv_viral_genes==0 & viruses$checkv_host_genes>=1] <- -1
viruses_sankey_tnv$vg_hg_pro[((viruses$checkv_viral_genes*3) <= viruses$checkv_host_genes) & viruses$provirus==FALSE] <- -1
viruses_sankey_tnv$cvl_hm[viruses$checkv_length>500000 & viruses$hallmark<=1] <- -1
viruses_sankey_tnv$cvl_cp[viruses$checkv_length>5000 & viruses$checkv_completeness<=75] <- -0.5             
viruses_sankey_tnv <- viruses_sankey_tnv %>%
  count(seqtype, kj_cel, hg_pro, vg_hg, vg_hg_pro, cvl_hm, cvl_cp, all) %>%
  mutate(viral_score=factor(all))

---
title: "Testing Set Features Visualization"
output: html_notebook
---

This notebook examines the relationship between different features of the data for distinguishing viral from nonviral sequences.

Please reach out to James Riddell (riddell.26@buckeyemail.osu.edu) or
Bridget Hegarty (beh53@case.edu) regarding any issues, or open an issue on github.

```{r setup-library}
library(ggplot2)
library(plyr)
library(reshape2)
library(viridis)
library(tidyr)
library(dplyr)
library(readr)
library(data.table)
```

```{r}
viruses <- read_tsv("../IntermediaryFiles/viral_tools_combined.tsv")
```

```{r}
colnames(viruses)
```

```{r}
ggplot(viruses, aes(x=RNA, y=viral)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("RNA VS2 Score") +
  ylab("VS2 Viral Score")

ggplot(viruses, aes(x=RNA, y=viral)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~confusion_matrix_high_MCC, scales = "free") +
  xlab("RNA VS2 Score") +
  ylab("VS2 Viral Score")

ggplot(viruses, aes(x=NCLDV, y=viral)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("NCLDV VS2 Score") +
  ylab("VS2 Viral Score")

ggplot(viruses, aes(x=NCLDV, y=viral)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~confusion_matrix_high_MCC, scales = "free") +
  xlab("NCLDV VS2 Score") +
  ylab("VS2 Viral Score")

ggplot(viruses, aes(x=lavidaviridae, y=viral)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Lavidaviridae VS2 Score") +
  ylab("VS2 Viral Score")

ggplot(viruses, aes(x=lavidaviridae, y=viral)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~confusion_matrix_high_MCC, scales = "free") +
  xlab("Lavidaviridae VS2 Score") +
  ylab("VS2 Viral Score")
```

# Important features by sequence type

```{r}
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
```

```{r}
ggplot(viruses, aes(x=hallmark, y=checkv_viral_genes)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Number of Hallmark Genes") +
  ylab("Number of Viral Genes")
```

```{r}
ggplot(viruses, aes(x=checkv_host_genes, y=checkv_viral_genes)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Number of Host Genes") +
  ylab("Number of Viral Genes")
```

```{r}
ggplot(viruses, aes(x=percent_unknown, y=percent_viral)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Percentage of Genes Unknown") +
  ylab("Percentage of Genes Viral")
```

```{r}
ggplot(viruses, aes(x=percent_unknown, y=checkv_length)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Percentage of Genes Unknown") +
  ylab("Length of Sequence")
```

```{r}
ggplot(viruses, aes(x=percent_viral, y=checkv_length)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Percentage of Genes Viral") +
  ylab("Length of Sequence") 
```

```{r}
ggplot(viruses, aes(x=hallmark, y=checkv_length)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Number of Hallmark Genes") +
  ylab("Length of Sequence") 
```

```{r}
ggplot(viruses, aes(x=checkv_host_genes, y=checkv_length)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Length of Sequence") +
  ylab("Number of Host Genes")
```
```{r}
table(viruses$checkv_host_genes>=50, viruses$seqtype)
```



```{r}
ggplot(viruses, aes(x=checkv_length, y=checkv_completeness)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Length") +
  ylab("Completeness") 
```

```{r}
ggplot(viruses, aes(x=hallmark, y=checkv_completeness)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis", trans="log10") +
  theme_bw() +
  facet_wrap(~seqtype, scales = "free") +
  xlab("Hallmark Genes") +
  ylab("Completeness") 
```

```{r}
table(viruses$seqtype[viruses$checkv_length>50000 & viruses$hallmark==0])/table(viruses$seqtype)
table(viruses$seqtype[((viruses$checkv_viral_genes*3) <= viruses$checkv_host_genes) & viruses$checkv_provirus=="No"])/table(viruses$seqtype)
table(viruses$seqtype[viruses$checkv_viral_genes==0 & viruses$checkv_host_genes>=1])/table(viruses$seqtype)

table(viruses$seqtype[viruses$percent_viral>=50])/table(viruses$seqtype)
table(viruses$seqtype[viruses$percent_unknown>=75])/table(viruses$seqtype)
table(viruses$seqtype[viruses$percent_unknown>=75 & viruses$checkv_length<50000])/table(viruses$seqtype)
table(viruses$seqtype[viruses$hallmark>2])/table(viruses$seqtype[viruses$seqtype %in% unique(viruses$seqtype[viruses$hallmark>2])])
```

```{r}
table(viruses$seqtype, viruses$Kaiju_Viral)
```

```{r}
seqdata <- data.frame(seqtype=viruses$seqtype[!duplicated(viruses$contig)])

rownames(seqdata) <- viruses$contig[!duplicated(viruses$contig)]
```

```{r}
library(phyloseq)
```


```{r}
features_table <- viruses[viruses$Index==1,]
features_table <- features_table[,colnames(features_table) %in% c( 
                                              "checkv_viral_genes",
                                              "checkv_host_genes",
                                              "checkv_unknown_genes",
                                              "checkv_length",
                                              "checkv_completeness",
                                              "checkv_total_genes",
                                              "percent_host",
                                              "percent_viral",
                                              "hallmark",
                                              "percent_unknown"
                                              )]

features_table[is.na(features_table)] <- 0
ft_colnames <- colnames(features_table)
features_table <- t(features_table)
rownames(features_table) <- ft_colnames
colnames(features_table) <- rownames(seqdata)

physeq_pooled <- phyloseq(otu_table(features_table, taxa_are_rows = T))
```

```{r}
ordination <- phyloseq::ordinate(physeq =physeq_pooled, method = "PCoA", distance = "bray")
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
                          shape="numtools", color="num_viruses") + 
  geom_point(size = 3) +
  theme_bw() +
  geom_label(label=seqdata$toolcombo)

phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
                          shape="numtools", color="num_viruses") + 
  geom_point(size = 3) +
  theme_bw()
```

# Viral Addition Tuning Rules

```{r}
viruses_sankey_tv <- data.frame(seqtype=viruses$seqtype,
                             kj_cel=rep(0,nrow(viruses)),
                             hall=rep(0,nrow(viruses)),
                             pv=rep(0,nrow(viruses)),
                             cvl_pu=rep(0, nrow(viruses)))
```

```{r}
viruses_sankey_tv$kj_cel[viruses$Kaiju_Viral=="Viruses"] <- 0.5
viruses_sankey_tv$hall[viruses$hallmark>2] <- 0.5
viruses_sankey_tv$pv[viruses$percent_viral>=50] <- 0.5
viruses_sankey_tv$cvl_pu[viruses$checkv_length>50000 & viruses$percent_unknown<=75] <- 0.5    

viruses_sankey_tv$all <- rowSums(viruses_sankey_tv[,2:5])
```

```{r}
viruses_sankey_tv %>%
  count(seqtype, all) %>% spread(key = all, value=n)
```

```{r}
viruses_sankey_tv <- viruses_sankey_tv %>%
  count(seqtype, kj_cel, hall, pv, cvl_pu, all) %>%
  mutate(viral_score=factor(all))
```

```{r}
ggplot(viruses_sankey_tv,
       aes(axis1 = kj_cel, axis2 = hall, axis3 = pv, axis4 = cvl_pu, y=n)) +
  geom_alluvium(aes(fill=viral_score),
                width = 0, knot.pos = 0, reverse = FALSE) +
  geom_stratum(width = 1/5) +
  theme_bw() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)),
            reverse = FALSE) +
  theme(
        axis.text.x=element_text(size=14, angle = 90)
        ) +
  scale_x_continuous(breaks=c(1,2,3,4),
    labels=c("kaiju", "hallmark", "% viral", "% unknown")) +
  facet_wrap(~seqtype, scales="free_y") 
``` 

# Viral Removal Tuning Rules

```{r}
viruses_sankey_tnv <- data.frame(seqtype=viruses$seqtype,
                             kj_cel=rep(0,nrow(viruses)),
                             hg_pro=rep(0,nrow(viruses)),
                             vg_hg=rep(0,nrow(viruses)),
                             vg_hg_pro=rep(0, nrow(viruses)),
                             cvl_hm=rep(0, nrow(viruses)),
                             cvl_cp=rep(0,nrow(viruses)))
```

```{r}
viruses_sankey_tnv$kj_cel[viruses$Kaiju_Viral=="cellular organisms"] <- -0.5
viruses_sankey_tnv$hg_pro[viruses$checkv_host_genes>50 & viruses$provirus==FALSE] <- -1
viruses_sankey_tnv$vg_hg[viruses$checkv_viral_genes==0 & viruses$checkv_host_genes>=1] <- -1
viruses_sankey_tnv$vg_hg_pro[((viruses$checkv_viral_genes*3) <= viruses$checkv_host_genes) & viruses$provirus==FALSE] <- -1
viruses_sankey_tnv$cvl_hm[viruses$checkv_length>500000 & viruses$hallmark<=1] <- -1
viruses_sankey_tnv$cvl_cp[viruses$checkv_length>5000 & viruses$checkv_completeness<=75] <- -0.5             
viruses_sankey_tnv$all <- rowSums(viruses_sankey_tnv[,2:7])
```

```{r}
viruses_sankey_tnv %>%
  count(seqtype, all) %>% spread(key = all, value=n)
```


```{r}
viruses_sankey_tnv <- viruses_sankey_tnv %>%
  count(seqtype, kj_cel, hg_pro, vg_hg, vg_hg_pro, cvl_hm, cvl_cp, all) %>%
  mutate(viral_score=factor(all))
```

```{r}
ggplot(viruses_sankey_tnv,
       aes(axis1 = kj_cel, axis2 = hg_pro, axis3 = vg_hg, axis4 = cvl_hm, axis5=cvl_hm, axis6=cvl_cp, 
           y=n)) +
  geom_alluvium(aes(fill=viral_score),
                width = 0, knot.pos = 0, reverse = FALSE) +
  geom_stratum(width = 1/5) +
  theme_bw() +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)),
            reverse = FALSE) +
  theme(
        axis.text.x=element_text(size=14, angle = 90)
        ) +
  scale_x_continuous(breaks=c(1,2,3,4,5,6),
    labels=c("kaiju", "host genes", "viral and host", "provirus",
             "hallmark", "complete")) +
  facet_wrap(~seqtype, scales="free_y") 
``` 













